{- 

    Copyright © 2011 - 2021, Ingo Wechsung
    All rights reserved.

    Redistribution and use in source and binary forms, with or
    without modification, are permitted provided that the following
    conditions are met:

        Redistributions of source code must retain the above copyright
        notice, this list of conditions and the following disclaimer.

        Redistributions in binary form must reproduce the above
        copyright notice, this list of conditions and the following
        disclaimer in the documentation and/or other materials provided
        with the distribution. Neither the name of the copyright holder
        nor the names of its contributors may be used to endorse or
        promote products derived from this software without specific
        prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE
    COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
    IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
    PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER
    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
    USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
    AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
    IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
    THE POSSIBILITY OF SUCH DAMAGE.

     -}
{--
 * This is the scanner for the frege compiler.
 *
 * Essentially, there are the following important functions:
 * - the 'lex' function turns a list of strings into a list of 'Token's.
 * - the 'layout' function takes the output from 'scan' and inserts
 *   braces and semicolons according to layout rules.
 * - the 'substAllOp' functions scans the output of the scan function and
 *   replaces sequences of single characters with operator symbols according
 *   to a table.
 *
 * The 'lex' function has no way to know what operators are defined in the
 * current source file or in imported packages. In previous versions of the
 * compiler, this has been solved by calling import and updating the operator tree
 * via a reference as soon as the parser reduced an "import" or "infix" declaration.
 * Nowadays, we
 * 0. build an initial operator table from the Prelude
 * 1. scan without knowledge of the operators,
 * 2. do the layout,
 * 3. look for fixity definitions and imports
 * 4. complete the operator table
 *    (which causes 'IO' actions in case of imports)
 * 5. substitute characters with operators
 *
 * The resulting token list is ready to be passed to the parser.
 * This way, parser and scanner are decoupled and are pure functions,
 * with the exception of the part that builds the operator table.
 *
 -}



package frege.compiler.grammar.Lexer where

import frege.Prelude hiding(<+>)


-- import of library packages


import Data.TreeMap as TM(TreeMap, keys, contains, fromKeys, each, member)
import Control.monad.State
-- import Data.List as DL(partitioned, takeUntil)

-- import of compiler packages
       
import Lib.PP       except (group, layout, break)


import Compiler.enums.TokenID

import  Compiler.types.Positions hiding(is)
import  Compiler.types.Tokens 
-- import  Compiler.types.Packs
import  Compiler.types.Global as G

import  Compiler.common.Errors as E()
-- import  Compiler.common.Resolve as R(enclosed)

type CharSeq = CharSequence     -- from Java.Lang

{--
    [usage] @slurp filename encoding@
    [return] the content of text file @filename@ appropriately decoded according to the files @encoding@
-}
native slurp frege.runtime.CompilerSupport.slurp :: String -> String -> IO String
    throws  UnsupportedEncodingException, IOException 

{--
 * A map of keywords to 'TokenID's
 -}
kwtree = TM.fromList [
    ("package", PACKAGE),
    ("module", PACKAGE),
    ("import" ,  IMPORT),
    ("native" ,  NATIVE),
    ("foreign", NATIVE),        -- for the time being
    ("if" ,  IF),
    ("then" ,  THEN),
    ("else" ,  ELSE),
    ("class" ,  CLASS),
    ("interface" ,  CLASS),
    ("where" ,  WHERE),
    ("instance" ,  INSTANCE),
    ("of" ,  OF),
    ("default", DEFAULT),
    ("derive" ,  DERIVE),
    ("deriving" ,  DERIVE),
    ("data" ,  DATA),
    ("newtype" ,  NEWTYPE),
    ("extends" ,  EXTENDS),
    ("case" ,  CASE),
    ("let" ,  LET),
    ("in" ,  IN),
    ("mutable", MUTABLE),
    ("super",  SUPER),
    ("type" ,  TYPE),
    ("true" ,  TRUE),
    ("false" ,  FALSE),
    ("protected" ,  PROTECTED),
    ("private" ,  PRIVATE),
    ("public" ,  PUBLIC),
    ("pure",     PURE),
    ("abstract", ABSTRACT),
    ("do" ,  DO),
    ("forall" ,  FORALL),
    ("throws" ,  THROWS),
    -- ("break" ,  BREAK),
    -- ("while" ,  WHILE),
    ("infix" ,  INFIX),
    ("infixl" ,  INFIXL),
    ("infixr" ,  INFIXR)];


{--
    checks if a user defined operator obeys certain rules:
    - it must not be one of "=" "|" "," ";" "." "\\" "_" "!" "?" "-"
    - it must not be one of "::" "<-" "->" or "=>"
    - it must not contain braces, square brackets or parentheses
    - it must not contain one of the quoting characters " \ ` or ´
    - it must consist of either all word characters (apostrophes are allowed but not as the first character)
      or all non word characters (except apostrophes which are not allowed)
 -}

validop "_" = false
validop ´[\(\[\{\}\]\)]´ = false
validop ´["`\´]´ = false
validop s   
    | member s specialT = false
    | s ~ ´^[\w']+$´    = true
    | s ~ ´^\W+$´       = s !~ ´'´
    | otherwise         = false


{--
    tells if character is forbidden in operator

    The following are forbidden in operators: comma, semicolon, 
    grave accent mark, apostrophe, double quote, acute accent mark,
    parentheses, braces, brackets, underscore, letters, digits and whitespace. 
 -}
forbidden ','  = true
forbidden ';'  = true
forbidden '´'  = true   -- ´
forbidden '"'  = true   -- "
forbidden '\'' = true
forbidden '`'  = true
forbidden '('  = true
forbidden ')'  = true
forbidden '['  = true
forbidden ']'  = true
forbidden '{'  = true
forbidden '}'  = true
forbidden '_'  = true
-- forbidden '\\' = true
forbidden ch   = ch.isLetterOrDigit || ch.isWhitespace 



--- check whether 'Token' is a specific char
is :: Token -> Char -> Bool
is t c = t.tokid == CHAR && t.value.charAt 0 == c

--- check whether 'Token' is not a specific char
isNot :: Token -> Char -> Bool
isNot t c = t.tokid != CHAR || t.value.charAt 0 != c



{--
    This function does the layout on a list of
    'Token's. The result is another list
    of 'Token's with some extra semicolons
    and braces in the correct places.

    The first argument is the context represented by a list of integers,
    where each element is either:
    - Zero, indicating that the enclosing context is explicit
     (i.e. the programmer supplied the opening brace).
     If the innermost context is 0, then no layout tokens will be inserted until
     either the enclosing context ends or a new context is pushed.
    - A positive integer, which is the indentation column of the enclosing layout context.

    The _indentation_ of a token is the column number indicating the start of that token;
    the indentation of a line is the indentation of its leftmost lexeme.
    To determine the column number, assume a fixed-width font. For the purposes
    of the layout rule, Unicode characters in a source
    program are considered to be of the same, fixed, width as an ASCII character.
    The first column is designated column 1, not 0.

    The layout is done according to the following rules:

-}
layout :: [Int] -> [Token] -> [Token]

---  1) an explicit \'{\' starts a new explicit context
layout !ctx (t1:ts)
    | t1 `is` '{' = t1 : layout (0:ctx) ts

---  2) an explicit \'}\' can only occur in explicit context and closes this context
layout (0:ms) (t1:ts)
    | t1 `is` '}', t1.col > 0  = t1:layout ms (t1.{tokid=COMMENT}:ts) 

{--
    3) if a *@let@*, *@do@*, *@where@* or *@of@* is not followed by \'{\'
       and the position of the next token is greater than the
       current context, insert \'{\' and push that position as new context.
       If the position is the same, insert \'{\' \'}\'

    4) If the position of the first token on a line matches the context,
       a \';\' is inserted before that token, except when the last token
       on the last line was already a semicolon.

    5) If the position of the first token on a line is less than the context,
       the context is closed and a closing brace is inserted.

    6) If *@in@* is found in layout mode
       without preceding closing brace, the closing brace is inserted
       and the context is closed

    7) At the end of the program, if there are open layout contexts,
       a corresponding number of closing braces is inserted.
-}
layout (m:ms) (t1:t2:ts)
    | kw t1.tokid, t2 `isNot` '{', t2.col > m
    = t1 : Token CHAR "{" t1.line 0 (t1.offset+t1.value.length) [] : layout (t2.col:m:ms) (t2:ts)
    | kw t1.tokid, t2 `isNot` '{', t2.line > t1.line, t2.col <= m, m>0
    = t1 : Token CHAR "{" t1.line 0 (t1.offset+t1.value.length) [] 
         : Token CHAR "}" t1.line 0 (t1.offset+t1.value.length+1) [] : 
         if t2 `isNot` ';' && t2.col == m
            then Token CHAR ";" t1.line 0 (t1.offset+t1.value.length+2) [] : layout (m:ms) (t2:ts) 
            else layout (m:ms) (t2:ts)
    | t2.line > t1.line, t2.col == m, t1 `isNot` ';'
    = t1 : Token CHAR ";" t1.line 0 (t1.offset+t1.value.length) [] : layout (m:ms) (t2:ts)
    | t2.line > t1.line, t2.col < m
    = t1 : layout ms (Token CHAR "}" t1.line 0 (t1.offset+t1.value.length) [] : t2 : ts)
    | m != 0, t2.tokid == IN, t1 `isNot` '}' && not (t1.tokid == COMMENT && t1.value == "}")
    = t1 : Token CHAR "}" t1.line 0 (t1.offset+t1.value.length) [] : layout ms (t2:ts)
    where
        kw LET = true; kw DO  = true; kw WHERE = true; kw OF = true; kw _ = false

layout ms (t1:ts) = t1:layout ms ts
layout [0] []     = []              -- proper end.
layout []  []     = []              -- proper end.
layout (m:ms) []
    | m > 0 = Token CHAR "}" Int.maxBound 0 Int.maxBound [] : layout ms []
    | otherwise = layout ms []    -- explicit brace missing

layout ms ts =
    traceLn ("layout " ++ show ms ++ "   " ++ show (take 3 ts)) `seq` []

-- it is in the prelude meanwhile
-- infixr 13 `!:`
-- !a !: as = a : as

{--
    Scan a 'CharSeq' and take care of offsets
-}
lex :: CharSeq -> Int -> Int -> Int -> [Token]
lex !cs !line !col !start
    | endOfSeq        = []
    | ch == '\n'      = lex cs (line+1) 1   (start+1)
    | ch.isWhitespace = lex cs line (col+1) (start+1)
    | ch.isUpperCase  = case ident (start+1) of
                            !end
                                | at end == '.' = Token QUALIFIER (seq end) line col start [] !: lex cs line (col+end-start+1) (end+1)
                                | otherwise     = Token CONID     (seq end) line col start [] !: lex cs line (col+end-start) end
    -- everything that is not an uppercase letter is treated as lowercase
    | identStart      = case ident (start+1) of
                        !end -> case Token VARID (seq end) line col start [] of
                            tok -> case TreeMap.lookupS kwtree tok.value of
                                Just f  -> tok.{tokid = f} !: lex cs line (col+end-start) end
                                Nothing -> tok             !: lex cs line (col+end-start) end
    | ch == '0', at (start+1) == 'x' || at (start+1) == 'X', hexdigit (at (start+2))
                      = hexNumber    (start+2)
    | digit ch        = integer      (start+1)
    | ch == '{'       = commentStart (start+1)
    | ch == '-', at (start+1) == '-'       = commentStart (start+1)
    | ch == '\''      = lexQuoted    (start+1)
    | ch == '"'       = lexQuoted    (start+1) -- "
    -- ch == '#'       = lexQuoted    (start+1) -- #
    | ch == '´'       = lexQuoted    (start+1)
    | ch == '`'       = lexQuoted    (start+1)
    | ch.isSurrogatePair (at (start+1)) = case cs.codePointAt start of
        cp  | Char.isUpperCase cp = case ident (start+2) of
                end
                    | at end == '.' = Token QUALIFIER (seq end) line col start [] !: lex cs line (col+end-start+1) (end+1)
                    | otherwise     = Token CONID     (seq end) line col start [] !: lex cs line (col+end-start) end
            -- everything that is not an uppercase letter is treated as lowercase
            | Char.isLetter cp = case ident (start+2) of
                end ->  case Token VARID (seq end) line col start [] of
                                tok -> case TreeMap.lookupS kwtree tok.value of
                                    Just f  -> tok.{tokid = f} !: lex cs line (col+end-start) end
                                    Nothing -> tok             !: lex cs line (col+end-start) end
            | otherwise  = lexOp (start+2)
    | not (forbidden ch) = lexOp (start+1)
    | otherwise       = Token CHAR (ctos ch) line col start [] !: lex cs line (col+1) (start+1)
    where
        identStart  = ch.isLetter || ch == '_'
        endOfSeq    = start >= cs.length      -- assume that length is a cheap operation on char sequences
        !ch         = if endOfSeq then '\0' else cs.charAt start
        seq end     = (cs.subSeq start end).toString
        digit ch    = ch >= '0' && ch <= '9'
        hexdigit ch = digit ch || ch >= 'a' && ch <= 'f' || ch >= 'A' && ch <= 'F'
        at i        = if i < cs.length then cs.charAt i else '\0'
        -- parse sequence of operator characters
        lexOp end
            | end+1 < cs.length,
              ch.isSurrogatePair (at (end+1)),
              cp <- cs.codePointAt end,
              not (Char.isLetter cp) = lexOp (end+2)
            | forbidden ch || end >= cs.length || ch.isSurrogatePair (at (end+1))
                =  case specialT.lookupS oper.value of
                    Just t  -> oper.{tokid=t} !: rest
                    Nothing -> oper !: rest 
            | otherwise = lexOp (end+1) 
            where 
                !ch = at end
                oper = Token SOMEOP (seq end) line col start []
                rest = lex cs line (col+end-start) end 
        -- parse a quoted construct
        lexQuoted end
            | ch == '\\',
              end+1 < cs.length       = lexQuoted (end+2)
            | end >= cs.length        = [Token LEXERROR (seq end) line col start []]
            | ch == '\n'              = Token LEXERROR (seq end) line col start [] !: lex cs line (col+end-start) end
            | ch != at start          = lexQuoted (end+1)
            | otherwise               = case at start of
                '"'  -> Token STRCONST (seq (end+1)) line col start [] !: lex cs line (col+end-start+1) (end+1) -- "
                '\'' -> Token CHRCONST (seq (end+1)) line col start [] !: lex cs line (col+end-start+1) (end+1)
                -- '#'  -> Token REGEXP   inner line col start [] !: lex cs line (col+end-start+1) (end+1) -- #
                '´'  -> Token REGEXP   inner line col start [] !: lex cs line (col+end-start+1) (end+1)
                _    -> Token someop   inner line col (start+1) [] !: lex cs line (col+end-start+1) (end+1)
            where
                !ch = at end
                inner = (cs.subSeq (start+1) end).toString
                someop = if validop inner then SOMEOP else LEXERROR
        -- parse a hexadecimal number "
        hexNumber end
            | hexdigit ch             = hexNumber (end+1)
            | ch == 'l' || ch == 'L'  = Token LONGCONST (seq (end+1)) line col start [] !: lex cs line (col+end-start+1) (end+1)
            | otherwise               = Token INTCONST  (seq end)     line col start [] !: lex cs line (col+end-start)   end
            where
                !ch       = at end

        -- parse a number
        integer end
            -- end >= cs.length       = [num]
            | digit ch               = integer (end+1)
            | ch == '_',
              digit (at (end+1)) && digit (at (end+2)) && digit (at (end+3)) && not (digit (at (end+4)))
                                     = integer (end+4)
            | ch == 'l' || ch == 'L' = Token LONGCONST (seq (end+1)) line col start [] !: lex cs line (col+end-start+1) (end+1)
            | ch == 'n' || ch == 'N' = Token BIGCONST  (seq (end+1)) line col start [] !: lex cs line (col+end-start+1) (end+1)
            | ch == 'f' || ch == 'F' = Token FLTCONST  (seq (end+1)) line col start [] !: lex cs line (col+end-start+1) (end+1)
            | ch == 'd' || ch == 'D' = Token DBLCONST  (seq (end+1)) line col start [] !: lex cs line (col+end-start+1) (end+1)
            | ch == 'z' || ch == 'Z' = Token DECCONST  (seq (end+1)) line col start [] !: lex cs line (col+end-start+1) (end+1)
            | ch == '.',
              digit (at (end+1))     = floatPart (end+1)
            
            | ch == 'e' || ch == 'E',
              digit (at (end+1))     = floatPart2 (end+1)
            | ch == 'e' || ch == 'E',
              at (end+1) == '+' || at (end+1) == '-',
              digit (at (end+2))     = floatPart2 (end+2)
            | otherwise              = num !: lex cs line (col+end-start) end
            where
                num       = Token INTCONST (seq end) line col start []
                !ch       = at end
                -- parse the first floating part, pointer is at first digit after .
                floatPart end
                    | digit ch                                  = floatPart (end+1)
                    | ch `elem` ['d', 'D', 'f', 'F', 'z', 'Z']  = integer end
                    | ch == 'e' || ch == 'E',
                      at (end+1) == '+' || at (end+1) == '-',
                      digit (at (end+2))                        = floatPart2 (end+2)
                    | ch == 'e' || ch == 'E',
                      digit (at (end+1))                        = floatPart2 (end+1)
                    | otherwise                                 = Token DBLCONST  (seq end) line col start []!: lex cs line (col+end-start) end
                    where
                        ch        = at end
                -- parse the second floating part, pointer is at first digit after "e+", "e-" or "e"
                -- see if we can get a documentation text
                floatPart2 end
                    | digit ch                                  = floatPart2 (end+1)
                    | ch `elem` ['d', 'D', 'f', 'F', 'z', 'Z']  = integer end
                    | otherwise                                 = Token DBLCONST  (seq end) line col start [] !: lex cs line (col+end-start) end
                    where
                        ch        = at end
        commentStart end
            | at end == '-' = lexComment 0 (at start == '{') proto cs line (col+2) (start+2)
            | otherwise     =  brace !: rest
            where
                brace = Token {tokid = CHAR, line, col, offset = start, value = ctos (at start), qual=[]}
                rest  = lex cs line (col+1) end
                proto = Token DOCUMENTATION "" line col start []
        ident end
            | (at end).isLetterOrDigit || at end == '_' || at end == '\'' = ident (end+1)
            | (at end).isSurrogatePair (at (end+1)),
              cp <- cs.codePointAt end,
              Char.isLetter cp || Char.isDigit cp = ident (end+2)
            | otherwise = end



lexComment :: Int -> Bool -> Token -> CharSeq -> Int -> Int -> Int -> [Token]
lexComment !nest !block !proto !cs !line !col !i
    | i >= cs.length = if block
                        then [proto.{tokid=LEXERROR, value = (cs.subSeq proto.offset i).toString}]
                        else if at (proto.offset+2) == '-'
                            then [proto.{value = "   "}]
                            else [proto.{tokid = COMMENT, value = "  "}]
    | block,
      at i     == '-',
      at (i+1) == '}'
                     =  if nest == 0
                        then if at (proto.offset+2) == '-'
                            then proto.{value = subseq i ++ "     "}                  !: lex cs line (col+2) (i+2)
                            else proto.{tokid = COMMENT, value = subseq i ++ "     "} !: lex cs line (col+2) (i+2)
                        else lexComment (nest-1) block proto cs line (col+2) (i+2)
    | block,
      at i     == '{',
      at (i+1) == '-'
                     =  lexComment (nest+1) block proto cs line (col+2) (i+2)
    | block,
      at i == '\n'   =  lexComment nest block proto cs (line+1) 1        (i+1)
    | at i == '\n'   =  if at (proto.offset+2) == '-'
                            then proto.{value = subseq i ++ "   "}                  !: lex cs line col i
                            else if i == proto.offset+2         -- --\n do NOT look further
                            then proto.{tokid = COMMENT, value = "  "} !: lex cs line col i
                            else proto.{tokid = COMMENT, value = subseq i ++ "   "} !: lex cs line col i
    | otherwise      =  lexComment nest block proto cs line     (col+1)  (i+1)
    where
        at n = if n >= cs.length then '\0' else cs.charAt n
        subseq n = (cs.subSeq (proto.offset+3) n).toString


--- special symbols in tree
specialT = TM.fromList [("::", DCOLON), ("∷", DCOLON), 
                            ("..", DOTDOT), ("…", DOTDOT),
                            ("=>", EARROW), ("⇒", EARROW),
                            ("->", ARROW),  ("→", ARROW),
                            ("<-", GETS),   ("←", GETS),
                            ("∀", FORALL),
                            ("≤", EXTENDS), ("≥", SUPER),
                            -- magic characters that are not operators
                            (".", CHAR), ("=", CHAR),
                            ("-", CHAR), ("|", CHAR),
                            ("?", CHAR), ("!", CHAR),
                            ("\\", CHAR),
                        ]


{--
    Replace '.' with '•' where appropriate
    
    1. if @.@ appears after a @(@
    2. if @.@ appears before a @)@
    3. if @.@ is enclosed in whitespace
-}    
substDot (p:d:n:ts) 
    | is d '.', 
      is p '(' ||                       -- (.          looks like a section
      is n ')' ||                       -- .)          looks like a section
      not (p.vor d) && not (d.vor n)    -- foo . bar   probably function application 
        = p !: substDot (d.{value="•", tokid=SOMEOP} !: n !: ts)
    | otherwise = p !: substDot (d:n:ts)
substDot not3 = not3    -- less than 3 tokens

{--
    Re-categorize keywords 'MUTABLE' and 'PURE' as 'VARID'
    unless it is followed by 'NATIVE' -}
substKW :: [Token] -> [Token]
substKW (p:n:ts)
    | p.tokid == MUTABLE || p.tokid == PURE,
      n.tokid != NATIVE
    = p.{tokid = VARID} !: substKW (n:ts)
    | otherwise = p !: substKW (n:ts)
substKW [p]  
    | p.tokid == MUTABLE || p.tokid == PURE
    = [p.{tokid = VARID}]
substKW other = other 

{--
    Make
    
    > QUALIFIER QUALIFIER SOMEOP
    > QUALIFIER SOMEOP
    
    look like a single token.
    
    This simplifies the grammar and lets us parse binary expressions
    with a lookahead of 1.
    
    Whenever an unqualified operator is required, 
    use 'frege.compiler.common.Desugar#unqualified' to emit an error.
     
    -}
substQQ :: [Token] -> [Token]
substQQ (q1:q2:n:xs)
    | q1.tokid == QUALIFIER,
      q2.tokid == QUALIFIER,
      n.tokid == SOMEOP
    = n.{qual = [q1,q2]} !: substQQ xs
    | q1.tokid == QUALIFIER,
      q2.tokid == SOMEOP
    = q2.{qual = [q1] } !: substQQ (n:xs)
substQQ (t:ts) = t : substQQ ts
substQQ []     = []

--- this is the lexical analysis pass
pass :: StIO [Token]
pass =
    do
        global <- getSTT
        let opts   = global.options

        fdata <- liftIO (slurp opts.source (maybe "utf-8" id opts.encoding) >>= return . Right
                        `catch` cantread opts.source
                        `catch` badencoding)
        liftStG do
            case fdata of
                Left exc -> do
                    E.error Position.null (msgdoc exc)
                    stio []
                Right string -> passCS (string2cs string)
  where
    cantread :: String -> IOException -> IO (String|String)
    cantread src exc = (return . Left) ("Can't read " ++ src ++ ": " ++ exc.getMessage)
    badencoding ::  UnsupportedEncodingException -> IO (String|String)
    badencoding exc = (return . Left) exc.getMessage

{--
    This is the entry point for lexical analysis of a program contained in an immutable @java.lang.CharSequence@
-}
passCS :: CharSeq  -> StG [Token]
passCS cs  =
    do
        -- g <- getST
        -- let prefix  =  g.options.prefix
        changeST Global.{sub <- SubSt.{code = cs}}
        changeST Global.{sub <- SubSt.{toks = arrayFromList result}}
        return result

    where
        result = lexer cs

lexer cs = merge comments  tokens
    where
        !lexed = lex cs 1 1 0
        !comments    =  filter Token.isComment lexed
        !nocomments  =  filter Token.noComment lexed 
        !first = case filter ((!=DOCUMENTATION) . Token.tokid) nocomments of
                (t1:t2:_) | t1.tokid==PROTECTED, t2.tokid==PACKAGE = t2
                (t:_) → t
                []    → Token PACKAGE "module" 1 1 0 []
        -- supply a starting '{' when module is missing and it was implicit mode
        openingbrace ts = case first.tokid of
            PACKAGE → ts
            CHAR | first.value.charAt 0 == '{' = ts
            _       → brace : ts
        !layoutmode = case first.tokid of
            PACKAGE → 0                                 -- start explicit
            CHAR | first.value.charAt 0 == '{' = 0      -- start explicit
            _       → 1                                 -- implicit at col 1
        !tokens    = (openingbrace . substKW . layout [layoutmode] . substDot) nocomments

private !brace = Token{tokid=CHAR, value="{", line=1, col=0, offset=0, qual=[]}

protected merge :: [Token] -> [Token] -> [Token]
protected merge [] b = b
protected merge (ass@a:as) (bss@b:bs)
    | a.offset < b.offset = a : merge as bss
    | otherwise           = b : merge ass bs
protected merge a _ = a


{-- 
    The following helper should avoid the message

    > save\frege\compiler\Scanner.java:3721: illegal start of expression
    >    return (java.lang.CharSequence)(frege.runtime.Delayed.<java.lang.String>forced(arg$1));
    >                                                          ^ 

    issued by buggy javac6 compilers.
    -}
string2cs !s = CharSeq.fromString s


--- test
main [fileortext] = do
        fdata <- slurp fileortext "utf-8"
            `catch` (\(t::Throwable) -> return fileortext)
        -- let r = evalState (passCS (string2cs fdata))
        println (lexer (string2cs fdata) )
        return ()
main xs = main [joined "\n" xs]
